#include <linux/mman.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <asm/hypervisor.h>
static struct proc_dir_entry *balloon_pde;
unsigned long credit;
-static unsigned long current_pages, max_pages;
+static unsigned long current_pages, most_seen_pages;
static inline pte_t *get_ptep(unsigned long addr)
{
parray = (unsigned long *)vmalloc(num_pages * sizeof(unsigned long));
if ( parray == NULL )
{
- printk("inflate_balloon: Unable to vmalloc parray\n");
- return 0;
+ printk(KERN_ERR "inflate_balloon: Unable to vmalloc parray\n");
+ return -EFAULT;
}
currp = parray;
- for ( i = 0; i < num_pages; i++ )
+ for ( i = 0; i < num_pages; i++, currp++ )
{
- /* NB. Should be GFP_ATOMIC for a less aggressive inflation. */
- vaddr = __get_free_page(GFP_KERNEL);
+ struct page *page = alloc_page(GFP_HIGHUSER);
+ unsigned long pfn = page - mem_map;
/* If allocation fails then free all reserved pages. */
- if ( vaddr == 0 )
+ if ( page == 0 )
{
- printk("Unable to inflate balloon by %ld, only %ld pages free.",
+ printk(KERN_ERR "Unable to inflate balloon by %ld, only %ld pages free.",
num_pages, i);
currp = parray;
- for(j = 0; j < i; j++){
- free_page(*currp++);
+ for(j = 0; j < i; j++, ++currp){
+ __free_page((struct page *) (mem_map + *currp));
}
+ ret = -EFAULT;
goto cleanup;
}
- *currp++ = vaddr;
+ *currp = pfn;
}
- currp = parray;
- for ( i = 0; i < num_pages; i++ )
+ for ( i = 0, currp = parray; i < num_pages; i++, currp++ )
{
- curraddr = *currp;
- *currp = virt_to_machine(*currp) >> PAGE_SHIFT;
- queue_l1_entry_update(get_ptep(curraddr), 0);
- phys_to_machine_mapping[__pa(curraddr) >> PAGE_SHIFT] = DEAD;
- currp++;
+ unsigned long mfn = phys_to_machine_mapping[*currp];
+ curraddr = page_address(mem_map + *currp);
+ if (curraddr)
+ queue_l1_entry_update(get_ptep(curraddr), 0);
+
+ phys_to_machine_mapping[*currp] = DEAD;
+ *currp = mfn;
}
XEN_flush_page_update_queue();
parray, num_pages, 0);
if ( unlikely(ret != num_pages) )
{
- printk("Unable to inflate balloon, error %lx\n", ret);
+ printk(KERN_ERR "Unable to inflate balloon, error %lx\n", ret);
goto cleanup;
}
* phys->machine mapping table looking for DEAD entries and populates
* them.
*/
-static unsigned long process_new_pages(unsigned long * parray,
+static unsigned long process_returned_pages(unsigned long * parray,
unsigned long num)
{
/* currently, this function is rather simplistic as
* incorporated here.
*/
- unsigned long tot_pages = start_info.nr_pages;
+ unsigned long tot_pages = most_seen_pages;
unsigned long * curr = parray;
unsigned long num_installed;
unsigned long i;
{
phys_to_machine_mapping[i] = *curr;
queue_machphys_update(*curr, i);
- queue_l1_entry_update(
+ if (i<max_low_pfn)
+ queue_l1_entry_update(
get_ptep((unsigned long)__va(i << PAGE_SHIFT)),
((*curr) << PAGE_SHIFT) | pgprot_val(PAGE_KERNEL));
- *curr = (unsigned long)__va(i << PAGE_SHIFT);
+ __free_page(mem_map + i);
+
curr++;
num_installed++;
}
}
- /*
- * This is tricky (and will also change for machine addrs that
- * are mapped to not previously released addresses). We free pages
- * that were allocated by get_free_page (the mappings are different
- * now, of course).
- */
- curr = parray;
- for ( i = 0; i < num_installed; i++ )
- {
- free_page(*curr);
- curr++;
- }
-
return num_installed;
}
if ( num_pages > credit )
{
- printk("Can not allocate more pages than previously released.\n");
+ printk(KERN_ERR "deflate_balloon: %d pages > %d credit.\n",
+ num_pages, credit);
return -EAGAIN;
}
parray = (unsigned long *)vmalloc(num_pages * sizeof(unsigned long));
if ( parray == NULL )
{
- printk("inflate_balloon: Unable to vmalloc parray\n");
+ printk(KERN_ERR "deflate_balloon: Unable to vmalloc parray\n");
return 0;
}
parray, num_pages, 0);
if ( unlikely(ret != num_pages) )
{
- printk("Unable to deflate balloon, error %lx\n", ret);
+ printk(KERN_ERR "deflate_balloon: xen increase_reservation err %lx\n",
+ ret);
goto cleanup;
}
- if ( (ret = process_new_pages(parray, num_pages)) < num_pages )
+ if ( (ret = process_returned_pages(parray, num_pages)) < num_pages )
{
- printk("Unable to deflate balloon by specified %lx pages, only %lx.\n",
- num_pages, ret);
+ printk(KERN_WARNING
+ "deflate_balloon: restored only %lx of %lx pages.\n",
+ ret, num_pages);
goto cleanup;
}
#define PAGE_TO_MB_SHIFT 8
+/*
+ * pagetable_extend() mimics pagetable_init() from arch/xen/mm/init.c
+ * The loops do go through all of low memory (ZONE_NORMAL). The
+ * old pages have _PAGE_PRESENT set and so get skipped.
+ * If low memory is not full, the new pages are used to fill it, going
+ * from cur_low_pfn up to low_pfn. High memory is not direct-mapped, so
+ * no page-table extension is needed for new high memory.
+ */
+
+static void pagetable_extend (int cur_low_pfn, int newpages)
+{
+ unsigned long vaddr, end;
+ pgd_t *kpgd, *pgd, *pgd_base;
+ int i, j, k;
+ pmd_t *kpmd, *pmd;
+ pte_t *kpte, *pte, *pte_base;
+ /* Never map past the end of ZONE_NORMAL (max_low_pfn). */
+ int low_pfn = min(cur_low_pfn+newpages,(int)max_low_pfn);
+
+ /*
+ * This can be zero as well - no problem, in that case we exit
+ * the loops anyway due to the PTRS_PER_* conditions.
+ */
+ end = (unsigned long)__va(low_pfn*PAGE_SIZE);
+
+ pgd_base = init_mm.pgd;
+ i = __pgd_offset(PAGE_OFFSET);
+ pgd = pgd_base + i;
+
+ /* Walk the kernel portion of the page tables up to 'end'. */
+ for (; i < PTRS_PER_PGD; pgd++, i++) {
+ vaddr = i*PGDIR_SIZE;
+ if (end && (vaddr >= end))
+ break;
+ pmd = (pmd_t *)pgd;
+ for (j = 0; j < PTRS_PER_PMD; pmd++, j++) {
+ vaddr = i*PGDIR_SIZE + j*PMD_SIZE;
+ if (end && (vaddr >= end))
+ break;
+
+ /* Filled in for us already? */
+ if ( pmd_val(*pmd) & _PAGE_PRESENT )
+ continue;
+
+ /* NOTE(review): __get_free_page() result is not checked for
+ * NULL; a failure here would be dereferenced below. TODO
+ * confirm this cannot fail at the point this is called. */
+ pte_base = pte = (pte_t *) __get_free_page(GFP_KERNEL);
+
+ for (k = 0; k < PTRS_PER_PTE; pte++, k++) {
+ vaddr = i*PGDIR_SIZE + j*PMD_SIZE + k*PAGE_SIZE;
+ if (end && (vaddr >= end))
+ break;
+ *pte = mk_pte_phys(__pa(vaddr), PAGE_KERNEL);
+ }
+ /* Xen requires page-table pages to be mapped read-only:
+ * clear _PAGE_RW on the new pte page before installing it. */
+ kpgd = pgd_offset_k((unsigned long)pte_base);
+ kpmd = pmd_offset(kpgd, (unsigned long)pte_base);
+ kpte = pte_offset(kpmd, (unsigned long)pte_base);
+ queue_l1_entry_update(kpte,
+ (*(unsigned long *)kpte)&~_PAGE_RW);
+ set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base)));
+ XEN_flush_page_update_queue();
+ }
+ }
+}
+
+/*
+ * claim_new_pages() asks xen to increase this domain's memory reservation
+ * and return a list of the new pages of memory. These new pages are
+ * added to the free lists of the memory manager.
+ *
+ * Available RAM does not normally change while Linux runs. To make this work,
+ * the linux mem= boottime command line param must say how big memory could
+ * possibly grow. Then setup_arch() in arch/xen/kernel/setup.c
+ * sets max_pfn, max_low_pfn and the zones according to
+ * this max memory size. The page tables themselves can only be
+ * extended after xen has assigned new pages to this domain.
+ *
+ * Returns the number of pages actually claimed, 0 if nothing could be
+ * claimed, or -EFAULT (the caller stores the result in a signed int)
+ * when xen grants too few pages to be usable.
+ */
+
+static unsigned long
+claim_new_pages(unsigned long num_pages)
+{
+ unsigned long new_page_cnt, pfn;
+ unsigned long * parray, *curr;
+
+ /* Never grow beyond the mem= ceiling established at boot. */
+ if (most_seen_pages+num_pages> max_pfn)
+ num_pages = max_pfn-most_seen_pages;
+ if (num_pages==0) return 0;
+
+ parray = (unsigned long *)vmalloc(num_pages * sizeof(unsigned long));
+ if ( parray == NULL )
+ {
+ printk(KERN_ERR "claim_new_pages: Unable to vmalloc parray\n");
+ return 0;
+ }
+
+ XEN_flush_page_update_queue();
+ new_page_cnt = HYPERVISOR_dom_mem_op(MEMOP_increase_reservation,
+ parray, num_pages, 0);
+ if (new_page_cnt != num_pages)
+ {
+ printk(KERN_WARNING
+ "claim_new_pages: xen granted only %lu of %lu requested pages\n",
+ new_page_cnt, num_pages);
+
+ /* XXX
+ * avoid xen lockup when user forgot to setdomainmaxmem. xen
+ * usually can dribble out a few pages and then hangs
+ */
+ if (new_page_cnt < 1000) {
+ printk(KERN_WARNING "Remember to use setdomainmaxmem\n");
+ HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation,
+ parray, new_page_cnt, 0);
+ vfree(parray); /* was previously leaked on this error path */
+ return -EFAULT;
+ }
+ }
+ /* Record the machine frame numbers of the granted pages. */
+ memcpy(phys_to_machine_mapping+most_seen_pages, parray,
+ new_page_cnt * sizeof(unsigned long));
+
+ /* Extend the direct map before handing the pages to the allocator. */
+ pagetable_extend(most_seen_pages,new_page_cnt);
+
+ for (pfn = most_seen_pages, curr = parray;
+ pfn < most_seen_pages+new_page_cnt;
+ pfn++, curr++ )
+ {
+ struct page *page = mem_map + pfn;
+
+#ifndef CONFIG_HIGHMEM
+ if (pfn>=max_low_pfn) {
+ printk(KERN_WARNING "Warning only %luMB will be used.\n",
+ pfn>>PAGE_TO_MB_SHIFT);
+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+ break;
+ }
+#endif
+ queue_machphys_update(*curr, pfn);
+ XEN_flush_page_update_queue();
+ if (pfn<max_low_pfn) {
+ /* Only low memory is direct-mapped and needs an L1 update. */
+ queue_l1_entry_update(get_ptep((unsigned long)__va(pfn << PAGE_SHIFT)),
+ ((*curr) << PAGE_SHIFT) | pgprot_val(PAGE_KERNEL));
+ XEN_flush_page_update_queue();
+ }
+
+ /* this next bit mimics arch/xen/mm/init.c:one_highpage_init() */
+ ClearPageReserved(page);
+ if (pfn>=max_low_pfn) set_bit(PG_highmem, &page->flags);
+ set_page_count(page, 1);
+ __free_page(page);
+ }
+
+ vfree(parray);
+
+ return new_page_cnt;
+}
+
static int balloon_write(struct file *file, const char *buffer,
u_long count, void *data)
{
char memstring[64], *endchar;
int len, i;
- unsigned long pages;
- unsigned long long target;
+ unsigned long target;
+ unsigned long long targetbytes;
/* Only admin can play with the balloon :) */
if ( !capable(CAP_SYS_ADMIN) )
return -EPERM;
if (count>sizeof memstring) {
- return -EFBIG;
+ return -EFBIG;
}
len = strnlen_user(buffer, count);
endchar = memstring;
+ /* NOTE(review): no copy_from_user() of 'buffer' into 'memstring' is
+ * visible in this hunk -- presumably it happens in elided context
+ * lines; verify before relying on memstring's contents below. */
for(i=0; i<len; ++i,++endchar) {
- if ('0'>memstring[i] || memstring[i]>'9') break;
+ if ('0'>memstring[i] || memstring[i]>'9') break;
}
if (i==0) return -EBADMSG;
- target = memparse(memstring,&endchar);
- pages = target >> PAGE_SHIFT;
+ /* Convert the user-supplied byte count to a target page count. */
+ targetbytes = memparse(memstring,&endchar);
+ target = targetbytes >> PAGE_SHIFT;
- if (pages < current_pages) {
- int change = inflate_balloon(current_pages-pages);
- if (change<0) return change;
+ /* Shrinking: give (current_pages - target) pages back to xen. */
+ if (target < current_pages) {
+ int change = inflate_balloon(current_pages-target);
+ if (change<=0) return change;
- current_pages -= change;
- printk("Relinquish %dMB to xen. Domain now has %ldMB\n",
- change>>PAGE_TO_MB_SHIFT, current_pages>>PAGE_TO_MB_SHIFT);
+ current_pages -= change;
+ printk(KERN_INFO "Relinquish %dMB to xen. Domain now has %luMB\n",
+ change>>PAGE_TO_MB_SHIFT, current_pages>>PAGE_TO_MB_SHIFT);
}
- else if (pages > current_pages) {
- int change = deflate_balloon(min(pages,max_pages) - current_pages);
- if (change<0) return change;
+ else if (target > current_pages) {
+ /* Growing happens in two phases: first reclaim pages previously
+ * ballooned out (up to most_seen_pages), then ask xen for brand
+ * new pages beyond anything this domain has held before. */
+ int change, reclaim = min(target,most_seen_pages) - current_pages;
+
+ if (reclaim) {
+ change = deflate_balloon( reclaim);
+ if (change<=0) return change;
+ current_pages += change;
+ printk(KERN_INFO "Reclaim %dMB from xen. Domain now has %luMB\n",
+ change>>PAGE_TO_MB_SHIFT, current_pages>>PAGE_TO_MB_SHIFT);
+ }
- current_pages += change;
- printk("Reclaim %dMB from xen. Domain now has %ldMB\n",
- change>>PAGE_TO_MB_SHIFT, current_pages>>PAGE_TO_MB_SHIFT);
+ if (most_seen_pages<target) {
+ int growth = claim_new_pages(target-most_seen_pages);
+ if (growth<=0) return growth;
+ most_seen_pages += growth;
+ current_pages += growth;
+ printk(KERN_INFO "Granted %dMB new mem by xen. Domain now has %luMB\n",
+ growth>>PAGE_TO_MB_SHIFT, current_pages>>PAGE_TO_MB_SHIFT);
+ }
}
+
return len;
}
static int balloon_read(char *page, char **start, off_t off,
- int count, int *eof, void *data)
+ int count, int *eof, void *data)
{
- int len;
- len = sprintf(page,"%lu\n",current_pages<<PAGE_SHIFT);
-
- if (len <= off+count) *eof = 1;
- *start = page + off;
- len -= off;
- if (len>count) len = count;
- if (len<0) len = 0;
- return len;
+ int len;
+
+ /* Report the current reservation in bytes (pages << PAGE_SHIFT). */
+ len = sprintf(page,"%lu\n",current_pages<<PAGE_SHIFT);
+
+ /* Standard read_proc bookkeeping: clamp output to the [off, off+count)
+ * window requested by the caller. */
+ if (len <= off+count) *eof = 1;
+ *start = page + off;
+ len -= off;
+ if (len>count) len = count;
+ if (len<0) len = 0;
+ return len;
}
static int __init init_module(void)
{
printk(KERN_ALERT "Starting Xen Balloon driver\n");
- max_pages = current_pages = start_info.nr_pages;
+ most_seen_pages = current_pages = min(start_info.nr_pages,max_pfn);
if ( (balloon_pde = create_xen_proc_entry("memory_target", 0644)) == NULL )
{
printk(KERN_ALERT "Unable to create balloon driver proc entry!");
balloon_pde->write_proc = balloon_write;
balloon_pde->read_proc = balloon_read;
+ /*
+ * make a new phys map if mem= says xen can give us memory to grow
+ */
+ if (max_pfn > start_info.nr_pages) {
+ extern unsigned long *phys_to_machine_mapping;
+ unsigned long *newmap;
+ newmap = (unsigned long *)vmalloc(max_pfn * sizeof(unsigned long));
+ phys_to_machine_mapping = memcpy(newmap, phys_to_machine_mapping,
+ start_info.nr_pages * sizeof(unsigned long));
+ }
+
return 0;
}
static char command_line[COMMAND_LINE_SIZE];
char saved_command_line[COMMAND_LINE_SIZE];
-static void __init parse_mem_cmdline (char ** cmdline_p)
+/* parse_mem_cmdline()
+ * returns the value of the mem= boot parameter converted to pages,
+ * or 0 if no mem= parameter was given
+ */
+static int __init parse_mem_cmdline (char ** cmdline_p)
{
char c = ' ', *to = command_line, *from = saved_command_line;
int len = 0;
+ unsigned long long bytes;
+ int mem_param = 0;
/* Save unparsed command line copy for /proc/cmdline */
memcpy(saved_command_line, start_info.cmd_line, COMMAND_LINE_SIZE);
} else if (!memcmp(from+4, "exactmap", 8)) {
from += 8+4;
} else {
- (void)memparse(from+4, &from);
- if (*from == '@')
+ bytes = memparse(from+4, &from);
+ mem_param = bytes>>PAGE_SHIFT;
+ if (*from == '@')
(void)memparse(from+1, &from);
}
}
}
*to = '\0';
*cmdline_p = command_line;
+
+ return mem_param;
}
/*
void __init setup_arch(char **cmdline_p)
{
- unsigned long bootmap_size, start_pfn, max_low_pfn;
+ unsigned long bootmap_size, start_pfn, lmax_low_pfn;
+ int mem_param; /* user specified memory size in pages */
+ int boot_pfn; /* low pages available for bootmem */
extern void hypervisor_callback(void);
extern void failsafe_callback(void);
init_mm.end_data = (unsigned long) &_edata;
init_mm.brk = (unsigned long) &_end;
- parse_mem_cmdline(cmdline_p);
+ /* The mem= kernel command line param overrides the detected amount
+ * of memory. For xenolinux, if this override is larger than detected
+ * memory, then boot using only detected memory and make provisions to
+ * use all of the override value. The hypervisor can give this
+ * domain more memory later on and it will be added to the free
+ * lists at that time. See claim_new_pages() in
+ * arch/xen/drivers/balloon/balloon.c
+ */
+ mem_param = parse_mem_cmdline(cmdline_p);
+ if (!mem_param) mem_param = start_info.nr_pages;
#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
/*
* Determine low and high memory ranges:
*/
- max_low_pfn = max_pfn = start_info.nr_pages;
- if (max_low_pfn > MAXMEM_PFN) {
- max_low_pfn = MAXMEM_PFN;
+ lmax_low_pfn = max_pfn = mem_param;
+ if (lmax_low_pfn > MAXMEM_PFN) {
+ lmax_low_pfn = MAXMEM_PFN;
#ifndef CONFIG_HIGHMEM
/* Maximum memory usable is what is directly addressable */
printk(KERN_WARNING "Warning only %ldMB will be used.\n",
* bootstrap page table. We are guaranteed to get >=512kB unused 'padding'
* for our own use after all bootstrap elements (see hypervisor-if.h).
*/
- bootmap_size = init_bootmem(start_pfn, max_low_pfn);
- free_bootmem(0, PFN_PHYS(max_low_pfn));
+ boot_pfn = min((int)start_info.nr_pages,lmax_low_pfn);
+ bootmap_size = init_bootmem(start_pfn,boot_pfn);
+ free_bootmem(0, PFN_PHYS(boot_pfn));
reserve_bootmem(__pa(&_stext),
PFN_PHYS(start_pfn) + bootmap_size + PAGE_SIZE-1 -
__pa(&_stext));
+ /* init_bootmem() set the global max_low_pfn to boot_pfn. Now max_low_pfn
+ * can be set to the override value.
+ */
+ max_low_pfn = lmax_low_pfn;
+
+
+
#ifdef CONFIG_BLK_DEV_INITRD
if ( start_info.mod_start != 0 )
{